Recall the mtcars dataset we worked with before, which comprises fuel consumption and other aspects of design and performance for 32 cars from 1974. The dataset has 11 dimensions, which is more than it is possible to visualize at the same time.
# Preview the first six rows of the built-in mtcars data set.
head(mtcars, n = 6)
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
Use prcomp() to compute a PCA for mtcars. Remember to set the scale parameter, as the variables are in different units and have different ranges.
mtcars.pca <- prcomp(mtcars, scale = TRUE)
# Scree plot: variances of the successive principal components.
plot(mtcars.pca)

# Percentage of the total variance explained by each component.
eig <- mtcars.pca$sdev^2
(var.exp <- 100 * prop.table(eig))
[1] 60.0763659 24.0951627 5.7017934 2.4508858 2.0313737 1.9236011
[7] 1.2296544 1.1172858 0.7004241 0.4730495 0.2004037
# Biplot: observations and variable loadings on the first two PCs.
biplot(mtcars.pca, cex = 1.5)
# Loadings (rotation matrix) for all 11 principal components.
mtcars.pca$rotation
PC1 PC2 PC3 PC4 PC5 PC6
mpg -0.3625305 0.01612440 -0.22574419 -0.022540255 0.10284468 -0.10879743
cyl 0.3739160 0.04374371 -0.17531118 -0.002591838 0.05848381 0.16855369
disp 0.3681852 -0.04932413 -0.06148414 0.256607885 0.39399530 -0.33616451
hp 0.3300569 0.24878402 0.14001476 -0.067676157 0.54004744 0.07143563
drat -0.2941514 0.27469408 0.16118879 0.854828743 0.07732727 0.24449705
wt 0.3461033 -0.14303825 0.34181851 0.245899314 -0.07502912 -0.46493964
qsec -0.2004563 -0.46337482 0.40316904 0.068076532 -0.16466591 -0.33048032
vs -0.3065113 -0.23164699 0.42881517 -0.214848616 0.59953955 0.19401702
am -0.2349429 0.42941765 -0.20576657 -0.030462908 0.08978128 -0.57081745
gear -0.2069162 0.46234863 0.28977993 -0.264690521 0.04832960 -0.24356284
carb 0.2140177 0.41357106 0.52854459 -0.126789179 -0.36131875 0.18352168
PC7 PC8 PC9 PC10 PC11
mpg 0.367723810 -0.754091423 0.235701617 0.13928524 -0.124895628
cyl 0.057277736 -0.230824925 0.054035270 -0.84641949 -0.140695441
disp 0.214303077 0.001142134 0.198427848 0.04937979 0.660606481
hp -0.001495989 -0.222358441 -0.575830072 0.24782351 -0.256492062
drat 0.021119857 0.032193501 -0.046901228 -0.10149369 -0.039530246
wt -0.020668302 -0.008571929 0.359498251 0.09439426 -0.567448697
qsec 0.050010522 -0.231840021 -0.528377185 -0.27067295 0.181361780
vs -0.265780836 0.025935128 0.358582624 -0.15903909 0.008414634
am -0.587305101 -0.059746952 -0.047403982 -0.17778541 0.029823537
gear 0.605097617 0.336150240 -0.001735039 -0.21382515 -0.053507085
carb -0.174603192 -0.395629107 0.170640677 0.07225950 0.319594676
Load ggfortify and use autoplot() on prcomp objects to easily generate ggplots.
# install.packages("ggfortify")
library(ggfortify)
library(dplyr)

# Encode the discrete design variables as factors so that autoplot()
# colours them with a discrete scale.
mtcars <- mtcars %>%
  mutate(am = factor(am), gear = factor(gear), cyl = factor(cyl))

# PCA scores labelled by car model, coloured by number of cylinders,
# then by number of gears.
autoplot(mtcars.pca, data = mtcars, colour = 'cyl', label = TRUE, label.size = 4)
autoplot(mtcars.pca, data = mtcars, colour = 'gear', label = TRUE, label.size = 4)

# Points coloured by transmission type, with variable-loading labels overlaid.
autoplot(mtcars.pca, data = mtcars, colour = 'am', label = FALSE,
         loadings.label = TRUE, loadings.label.size = 4)

# Labels only (points suppressed), with loading arrows and their labels.
autoplot(mtcars.pca, size = 0, label = TRUE, label.size = 4,
         loadings = TRUE, loadings.label = TRUE,
         loadings.label.size = 5) + theme_classic()
We will generate synthetic clustered data to use for k-means clustering.
set.seed(489576)
N <- 1000

# Helper: draw N points from a standard bivariate normal centred at (mx, my),
# tagged with the given cluster name.
sim_cluster <- function(name, mx, my) {
  data.frame(cluster = name, x = rnorm(n = N, mean = mx), y = rnorm(n = N, mean = my))
}

# Three well-separated clusters, stacked into one data frame.
C1 <- sim_cluster("C1", 1, 1)
C2 <- sim_cluster("C2", -2, -5)
C3 <- sim_cluster("C3", 5, 1)
DF <- rbind(C1, C2, C3)
# Scatter plot of the simulated points, coloured by true cluster.
ggplot(DF, aes(x = x, y = y, color = cluster)) +
  geom_point()
# Run k-means with k = 3 on the coordinates only (the first column holds
# the true cluster label, so it is excluded from the clustering).
kmeans.res <- kmeans(x = DF[, c("x", "y")], centers = 3)

# Fitted cluster centres.
kmeans.res$centers
x y
1 0.9614141 1.0058486
2 -2.0431190 -4.9932291
3 5.0098346 0.9780214
# Confusion matrix: k-means assignments (rows) vs true cluster labels (columns).
table(kmeans = kmeans.res$cluster, true = DF$cluster)
true
kmeans C1 C2 C3
1 968 1 23
2 0 999 0
3 32 0 977
library(ggplot2)

# Store the k-means assignment as a factor for discrete colouring.
DF$kmeans <- factor(kmeans.res$cluster)

# Points coloured by assigned cluster, with the fitted centres overlaid in black.
ggplot(DF, aes(x, y)) +
  geom_point(aes(color = kmeans), alpha = 0.5) +
  geom_point(data = data.frame(x = kmeans.res$centers[, 1],
                               y = kmeans.res$centers[, 2]),
             aes(x, y), size = 3, color = "Black")
# Repeat the clustering with one centre too many (k = 4).
# NOTE: the original code used DF[, -1], which at this point also includes
# the `kmeans` factor column added above — so the previous clustering's
# assignments leaked into the new clustering as a feature (visible as a
# `kmeans` dimension in the printed centres). Select the coordinate
# columns explicitly instead.
kmeans.res2 <- kmeans(x = DF[, c("x", "y")], centers = 4)
kmeans.res2$centers
x y kmeans
1 5.3301476 1.7351006 3
2 4.7001362 0.2460306 3
3 0.9614141 1.0058486 1
4 -2.0431190 -4.9932291 2
# Visualise the 4-cluster solution; one true cluster is split in two.
DF$kmeans2 <- factor(kmeans.res2$cluster)
ggplot(DF, aes(x, y, color = kmeans2)) +
geom_point(alpha = 0.5)
In this exercise you will use a dataset published in a study by Khan et al. 2001 to perform a hierarchical clustering of the patients in the study based on their overall gene expression data.
This data set consists of expression levels for 2,308 genes. The training and test sets consist of 63 and 20 observations (tissue samples) respectively.
Here, we will use the training set, as for now we are only interested in learning how hclust() works. First, load the ISLR package, where the data is available. The gene expression data is available in an object Khan$xtrain; you can learn more about the data set by typing in ?Khan after loading the ISLR package.
library(ISLR)
# Training gene expression matrix: 63 tissue samples x 2,308 genes.
gene.expression <- Khan$xtrain
dim(gene.expression)
[1] 63 2308
# Pairwise Euclidean distances between the 63 samples, followed by
# agglomerative hierarchical clustering with average linkage.
D <- dist(gene.expression)
khan.hclust <- hclust(D, method = "average")
Using the tumor class labels in Khan$ytrain, check whether the clustering groups observations from the same tumor class nearby.
plot(khan.hclust, labels = Khan$ytrain)
The files are data on the 28x28 pixel images of digits (0-9). The data is composed of:
a label column denoting the digit on the image, and pixel0 through pixel783, which contain information on the pixel intensity (on a scale of 0-255) and together form the vectorized version of the 28x28 pixel digit image. Download the data from the course repository:
# Load the already subsetted MNIST data (1,000 images: label + 784 pixel columns).
mnist.url <- "https://github.com/cme195/cme195.github.io/raw/master/assets/data/mnist_small.csv"
train <- read.csv(mnist.url, row.names = 1)
dim(train)
[1] 1000 785
# Peek at the first 10 rows and columns: the row names are the original
# MNIST indices; `label` is the digit, followed by pixel intensities.
train[1:10, 1:10]
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8
4776 5 0 0 0 0 0 0 0 0 0
26136 0 0 0 0 0 0 0 0 0 0
25589 7 0 0 0 0 0 0 0 0 0
26181 0 0 0 0 0 0 0 0 0 0
36156 9 0 0 0 0 0 0 0 0 0
26890 3 0 0 0 0 0 0 0 0 0
399 4 0 0 0 0 0 0 0 0 0
9766 1 0 0 0 0 0 0 0 0 0
27971 2 0 0 0 0 0 0 0 0 0
21594 6 0 0 0 0 0 0 0 0 0
# Compare with PCA: project the pixel data (label column excluded)
# onto its first two principal components.
pca <- prcomp(train[, -1])
coord.pca <- data.frame(pca$x[, 1:2])
coord.pca$label <- factor(train$label)

# Plot the digits at their PC coordinates, coloured by digit label.
ggplot(coord.pca, aes(x = PC1, y = PC2)) + ggtitle("PCA") +
  geom_text(aes(label = label, color = label), alpha = 0.8)
# Embed the same pixel data in two dimensions with t-SNE.
library(Rtsne)
set.seed(123)  # for reproducibility: t-SNE is stochastic
tsne <- Rtsne(train[, -1], dims = 2, perplexity = 30,
              verbose = FALSE, max_iter = 500)

# Plot the digits at their t-SNE coordinates, coloured by digit label.
coord.tsne <- data.frame(tsne$Y)
coord.tsne$label <- factor(train$label)
ggplot(coord.tsne, aes(x = X1, y = X2)) + ggtitle("tSNE") +
  geom_text(aes(label = label, color = label), alpha = 0.8)
t-SNE seems to be much better than PCA at separating the digits from each other.
# Record the R version and attached package versions used for this analysis.
sessionInfo()
R version 3.5.0 (2018-04-23)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] Rtsne_0.13 ggfortify_0.4.4 randomForest_4.6-14
[4] ISLR_1.2 forcats_0.3.0 stringr_1.3.1
[7] dplyr_0.7.99.9000 purrr_0.2.5 readr_1.1.1
[10] tidyr_0.8.1 tibble_1.4.2 ggplot2_3.0.0.9000
[13] tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] tidyselect_0.2.4 haven_1.1.2 lattice_0.20-35 colorspace_1.3-2
[5] htmltools_0.3.6 yaml_2.2.0 base64enc_0.1-3 utf8_1.1.4
[9] rlang_0.2.2.9002 pillar_1.3.0 glue_1.3.0 withr_2.1.2
[13] modelr_0.1.2 readxl_1.1.0 plyr_1.8.4 munsell_0.5.0
[17] gtable_0.2.0 cellranger_1.1.0 rvest_0.3.2 evaluate_0.11
[21] labeling_0.3 knitr_1.20 fansi_0.2.3 broom_0.5.0
[25] Rcpp_0.12.19.2 scales_1.0.0 backports_1.1.2 jsonlite_1.5
[29] gridExtra_2.3 hms_0.4.2 digest_0.6.17 stringi_1.2.4
[33] grid_3.5.0 rprojroot_1.3-2 cli_1.0.0 tools_3.5.0
[37] magrittr_1.5 lazyeval_0.2.1 crayon_1.3.4 pkgconfig_2.0.2
[41] xml2_1.2.0 lubridate_1.7.4 assertthat_0.2.0 rmarkdown_1.10
[45] httr_1.3.1 rstudioapi_0.7 R6_2.2.2 nlme_3.1-137
[49] compiler_3.5.0